{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b276e83b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "import random\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0e47f0d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_path = 'melon17_split'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b344d2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = dataset_path.split('_')[0]\n",
    "print('数据集', dataset_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3387f002",
   "metadata": {},
   "outputs": [],
   "source": [
    "classes = os.listdir(dataset_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "edebd054",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(classes)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55f19fd6",
   "metadata": {},
   "source": [
    "# 创建文件夹"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8633d9c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 创建 train 文件夹\n",
    "os.mkdir(os.path.join(dataset_path, 'train'))\n",
    "\n",
    "# 创建 test 文件夹\n",
    "os.mkdir(os.path.join(dataset_path, 'val'))\n",
    "\n",
    "# 在 train 和 test 文件夹中创建各类别子文件夹\n",
    "for fruit in classes:\n",
    "    os.mkdir(os.path.join(dataset_path, 'train', fruit))\n",
    "    os.mkdir(os.path.join(dataset_path, 'val', fruit))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d836d6e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_frac = 0.2  # 测试集比例\n",
    "random.seed(123) # 随机数种子，便于复现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d455dfd",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame()\n",
    "\n",
    "print('{:^18} {:^18} {:^18}'.format('类别', '训练集数据个数', '测试集数据个数'))\n",
    "\n",
    "for fruit in classes: # 遍历每个类别\n",
    "\n",
    "    # 读取该类别的所有图像文件名\n",
    "    old_dir = os.path.join(dataset_path, fruit)\n",
    "    images_filename = os.listdir(old_dir)\n",
    "    random.shuffle(images_filename) # 随机打乱\n",
    "\n",
    "    # 划分训练集和测试集\n",
    "    testset_numer = int(len(images_filename) * test_frac) # 测试集图像个数\n",
    "    testset_images = images_filename[:testset_numer]      # 获取拟移动至 test 目录的测试集图像文件名\n",
    "    trainset_images = images_filename[testset_numer:]     # 获取拟移动至 train 目录的训练集图像文件名\n",
    "\n",
    "    # 移动图像至 test 目录\n",
    "    for image in testset_images:\n",
    "        old_img_path = os.path.join(dataset_path, fruit, image)         # 获取原始文件路径\n",
    "        new_test_path = os.path.join(dataset_path, 'val', fruit, image) # 获取 test 目录的新文件路径\n",
    "        shutil.move(old_img_path, new_test_path) # 移动文件\n",
    "\n",
    "    # 移动图像至 train 目录\n",
    "    for image in trainset_images:\n",
    "        old_img_path = os.path.join(dataset_path, fruit, image)           # 获取原始文件路径\n",
    "        new_train_path = os.path.join(dataset_path, 'train', fruit, image) # 获取 train 目录的新文件路径\n",
    "        shutil.move(old_img_path, new_train_path) # 移动文件\n",
    "    \n",
    "    # 删除旧文件夹\n",
    "    assert len(os.listdir(old_dir)) == 0 # 确保旧文件夹中的所有图像都被移动走\n",
    "    shutil.rmtree(old_dir) # 删除文件夹\n",
    "    \n",
    "    # 输出每一类别的数据个数\n",
    "    print('{:^18} {:^18} {:^18}'.format(fruit, len(trainset_images), len(testset_images)))\n",
    "    \n",
    "    # 保存到表格中\n",
    "    df = df.append({'class':fruit, 'trainset':len(trainset_images), 'testset':len(testset_images)}, ignore_index=True)\n",
    "\n",
    "# 重命名数据集文件夹\n",
    "shutil.move(dataset_path, dataset_name+'_split')\n",
    "\n",
    "# 数据集各类别数量统计表格，导出为 csv 文件\n",
    "df['total'] = df['trainset'] + df['testset']\n",
    "df.to_csv('数据量统计.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c56b6a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "021d91bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "!sudo snap install tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20295570",
   "metadata": {},
   "outputs": [],
   "source": [
    "!tree fruit81_split -L 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "555eb1da",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8975054",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
